#/usr/bin/perl
use POSIX "fmod"; # float modulus
use List::Util qw[min]; # min function
# GLOBAL
my $grid_size_x = 300;
my $grid_size_y = 300;
my $locitot = 99;
my $name = "sim";
my $vertical_scaling_factor = 1.5; # for hex grid is 0.75, we'll take modulus of 1.5
my $biopsy_size_x = 20; # size of biopsy, 10x10 = 100 cells maximum
my $biopsy_size_y = 10; # biopsies are 1mm by 2mm correspond to 10 crypts by 20 crypts


# Draws random n locations
sub draw_random_locations{
    my ($n,$leftx,$rightx,$topy,$boty) = @_;
    my %h=();
    my $size=0;
    while($size<$n){
        $x = $leftx+int(rand($rightx-$leftx+1)); 
        $y = $topy+int(rand($boty-$topy+1)); 	
	# Convert abs to hex
	my ($hx, $hy) = abs_to_hex_coords($x, $y);
	# Ensure we pick out n DIFFERENT cells
	# do not pick out the same cell twice
        if(!$h{$hx}{$hy}){
            $h{$hx}{$hy}=1;
	    $size++;
        }
    }
    return %h;
}

# convert coords
sub abs_to_hex_coords{
    my ($x, $y) = @_;    
    $y = $y*($vertical_scaling_factor/2.0);
    # Depending on picking y that is divisible by 1.5 or not, add 0.5 to x                                         
    if(fmod($y,$vertical_scaling_factor)){
	$x = $x + 0.5;
    }
    return ($x, $y);
}

sub hex_to_abs_coords{
    my ($x, $y) = @_;    
    # Depending on picking y that is divisible by 1.5 or not, add 0.5 to x                                         
    if(fmod($y,$vertical_scaling_factor)){
	$x = $x - 0.5;
    }
    $y = $y/($vertical_scaling_factor/2.0);
    return ($x, $y);
}

=pod
my %hb = draw_random_locations(10,1,5,1,5);
for my $kx (keys %hb){
    for my $ky (keys %{$hb{$kx}}){
	print $kx." ".$ky."\n";
    }
}    
die;
=cut


# Start from sweep 1
# do not use my, so that these hashes are global
 %grid;
 %patterns;
 %indexx, %indexy; 
for $sw ($ARGV[0]..$ARGV[1]){

    # Seed the random number generator with simulation number
    srand ($sw);

    my %grid_coords_x =();
    my %grid_coords_y =();
    open (in,"$name$sw/$name$sw.crypts");
    while(<in>){
	chomp;
	@row=split(/\,/);
	$grid_coords_x{$row[0]}=$row[1];
	$grid_coords_y{$row[0]}=$row[2];
    }
    close in;

    %patterns=();

    # Load all patterns in memory
    open (in,"$name$sw/$name$sw.patterns");
    while(<in>){
	chomp;
	@row=split(/\,/);
	$patterns{$row[0]}="";
	for my $loc (0..$locitot){
	    # Hash only the locations that are different from 
	    # the wild type "0" microsattelite length	    
	    # Shaka Zulu proposes hashing only the clone_id key
	    # and as value concatenate "locus,value,locus,value"
	    if($row[$loc+1]!=0){
		# Leading comma ",loc,val,loc,val"
		$patterns{$row[0]}=$patterns{$row[0]}.",".$loc.",".$row[$loc+1];
	    }
	}
    }
    close in;

    
    # Go through the log file of grid state
    %grid=();
    open (i1,"$name$sw/$name$sw.log");
    open (out, ">$name$sw/$name$sw.divergence.biopsy.sampling.log");
    open (out2, ">$name$sw/$name$sw.divergence.biopsy.sampling.locations.log");
    while(<i1>){
	chomp;
	@row=split(/\,/);
	# At day 7300, get the grid state
	if($row[2]==7300){
	    # Important - throw it as a double in the hash
	    $grid{$grid_coords_x{$row[0]}*1.0}{$grid_coords_y{$row[0]}*1.0}=$row[1];
	}
    }    
    close i1;    


    # Compute divergence as a function of 
    # A. number of biopsies
    # B. number of cells sampled within a biopsy

    my $cryptsam;
    for $i (2..20){
	$cryptsam[$i] = $i;
    }
    $cryptsam[21] = 50;
    $cryptsam[22] = 100;

    # take 1,2,3,4,5,6,7,8,9,10 biopsies
    # take 1,2,3,4,5 cells per biopsy
    # 20 replicates, 10 biopsies, 20 cells
    # 4000 combinations
    # instead of 96, 213 = 224.25 is the maximum 
    for my $replicate (1..10){
	for my $biopsy (1..10){
	    for my $cryptind (2..22){
		$crypt = $cryptsam[$cryptind];
		# Choose the locations of biopsies randomly, but make sure no two biopsies overlap
		# Sample wifout replacement
		my $biopsy_count = 0;
		my %biopsy_coords = ();
		while($biopsy_count<$biopsy){
		    # Pick a biopsy
		    %biopsyhash = ();
		    %biopsyhash = draw_random_locations(1,0,$grid_size_x-1-$biopsy_size_x,0,$grid_size_y-1-$biopsy_size_y);
		    # Get x, y of biopsy (hex coords)
		    for my $kx (keys %biopsyhash){
			for my $ky (keys %{$biopsyhash{$kx}}){
			    $bx = $kx;
			    $by = $ky;
			}
		    }   
		    my ($bx, $by) = hex_to_abs_coords($bx, $by);
		    $f = 1;
		    # go through all coords of biopsies
		    for my $i (1..$biopsy_count){
			if(abs($bx-$biopsy_coords{$i}{"x"})<=$biopsy_size_x &&
			   abs($by-$biopsy_coords{$i}{"y"})<=$biopsy_size_y ){
			    # failed, re-choose a biopsy coordinates
			    $f = 0;
			    last;
			}
		    }
		    if($f==1){
			# add biopsy
			$biopsy_count++;
			$biopsy_coords{$biopsy_count}{"x"}=$bx;
			$biopsy_coords{$biopsy_count}{"y"}=$by;
		    }
		}
		
		# Evenly distribute crypts in biopsies
		my %number_of_crypts_per_biopsy = ();
		for my $b (1..$biopsy){
		    $number_of_crypts_per_biopsy{$b} = int($crypt/$biopsy); 
		}
		for my $b (1..($crypt%$biopsy)){
		    $number_of_crypts_per_biopsy{$b} += 1;
		}
		
		%indexx=();
		%indexy=();
		my $i=0;

		# Go through each biopsy and draw crypts
		for my $b (1..$biopsy){
		    my %cryptshash = draw_random_locations($number_of_crypts_per_biopsy{$b},
			$biopsy_coords{$b}{"x"}, $biopsy_coords{$b}{"x"}+$biopsy_size_x,
			$biopsy_coords{$b}{"y"}, $biopsy_coords{$b}{"y"}+$biopsy_size_y);
		    # Fill up the list of cells		   
		    for my $kx (keys %cryptshash){
			for my $ky (keys %{$cryptshash{$kx}}){
			    $indexx{$i}=$kx;
			    $indexy{$i}=$ky;
			    $i++;	    
			}
		    }    		    
		}

		# Compute divergence
		my ($div,%divpair) = compute_divergence($i,\%grid,\%patterns,\%indexx,\%indexy);
		$rounded = sprintf("%.6f", $div);	    
		print out "$replicate,$biopsy,$crypt,$rounded\n";

		# Log locations of picked cells
		print out2 "$replicate,$biopsy,$crypt,$rounded";
		for my $loc (0..$i-1){
		    print out2 ",".$indexx{$loc}.",".$indexy{$loc};
		}
		print out2 "\n";
		
		# Log the distograms
		open (out3, ">$name$sw/$name$sw.biopsy.sampling.distogram.$replicate.$biopsy.$crypt.log");
		for my $comparison (keys %divpair){
		    my $genetic_distance = sprintf("%.6f", $divpair{$comparison}{"gd"});	    
		    my $spatial_distance = sprintf("%.1f", $divpair{$comparison}{"sd"});
		    print out3 $spatial_distance.",".$genetic_distance."\n";
		}
		close out3;
	    }
	}
    }    
    close out2;
    close out;
    print STDERR "Sim $sw done\n";
}

    
=pod
# Create allele dropouts
%new_patterns;

# Per panel 60% of 244 loci amplify
my %rem_hash = draw_rand_hash(147,244);
my %index_rem_hash, $ind=0;
for my $key (keys %rem_hash) {
    $index_rem_hash{$key} = $ind;
    $ind++;
}

for my $i (0..($n-1)){
    my $cid = $grid{$indexx{$i}}{$indexy{$i}};
    # Per individual cell, 60% of 147 loci amplify
    my %final_hash = draw_rand_hash(88,147);
    for my $loc (0..243){
	$new_patterns{$cid}{$loc} = $patterns{$cid}{$loc};
	if(!$rem_hash{$loc}){
	    # Globally did not amplify
	    $new_patterns{$cid}{$loc} = -1;
	}
	if(!$final_hash{$index_rem_hash{$loc}}){
	    # Did not amplify in this cell
	    $new_patterns{$cid}{$loc} = -1;
	}
    }
}
=cut


sub compute_divergence{
    # Number of cells
    my ($n,$gridRef,$patternsRef,$indexxRef,$indexyRef) = @_;
    my %grid = %$gridRef;
    my %patterns = %$patternsRef;
    my %indexx = %$indexxRef;
    my %indexy = %$indexyRef;
    my %divergenceofpair = ();

    my $divergence = 0.0;
    my $dead_cells_count = 0;

    my $comparison_count = 0;
    # Compare pattern of cell i to patterns of cell j
    # A total of n*(n-1)/2 comparisons 
    for my $i (0..($n-2)){
	my @col;
	# get the clone_id
	my $cid = $grid{$indexx{$i}}{$indexy{$i}};

	# Skip comparing dead cells
	if($cid==-1){
	    $dead_cells_count++;
	}
	# count if last cell is dead
	if($i==$n-2 && $grid{$indexx{$i+1}}{$indexy{$i+1}}==-1){
	    $dead_cells_count++;
	}

	if($cid!=-1){

	    for my $j (($i+1)..($n-1)){
		my $cid2 = $grid{$indexx{$j}}{$indexy{$j}};

		# Speedup: if cid2 is a dead cell, skip it.
		if($cid2!=-1){
		    # Significant speedup: If the two chosen cells are the same clone
		    # then divergence is 0 between them, thus skip looping over
		    # their microsattelite patterns
		    # Only when they are different clones, compare their pattern
		    my $hamming = 0;
		    if($cid!=$cid2){
						
			# Convert pattern
			my @cell_one = split(/\,/,$patterns{$cid});
			my @cell_two = split(/\,/,$patterns{$cid2});
			
			my %h1 = ();
			my %h2 = ();

			for my $crnch (1..$#cell_one){
			    if($crnch%2==1){
				# Hash locus, value
				$h1{$cell_one[$crnch]} = $cell_one[$crnch+1];
			    }
			}
			for my $crnch (1..$#cell_two){
			    if($crnch%2==1){
				# Hash locus, value
				$h2{$cell_two[$crnch]} = $cell_two[$crnch+1];
			    }
			}

			# Loop over cell one hashed pattern
			while (($locus, $value) = each(%h1)) {
			    #print "$cid $locus $value";
			    # If the locus is present in cell two
			    if(defined($h2{$locus})){
				$hamming += abs($h1{$locus}-$h2{$locus});			    
			    } else {
				$hamming += abs($h1{$locus});
			    }
			}
			#print "Cell TWO:\n";
			# Loop over cell two hashed pattern
			while (($locus, $value) = each(%h2)) {
			    #print "$cid2 $locus $value";
			    # If the locus is present in cell one, do nothing, if absent, compare to 100
			    if(!defined($h1{$locus})){			    
				$hamming += abs($h2{$locus});			 
			    }
			}
			# The rest of the loci are all 100s, so hamming distance is 0.
			# The distance between two cells is hamming/#of valid comparisons
			$divergence += $hamming*1.0/($locitot+1);
			
		    }
		    # Save divergence of the pair of cells whether or not they are from the same clone
		    # Set divergence of the chosen two cells to -1 (comparing dead cells)
		    $divergenceofpair{$comparison_count}{"gd"} = $hamming*1.0/($locitot+1);
		    my ($ax, $ay) = hex_to_abs_coords($indexx{$i}, $indexy{$i});
		    my ($bx, $by) = hex_to_abs_coords($indexx{$j}, $indexy{$j});
		    # Distance base on wrap around hex grid
		    my $xd = min ($grid_size_x-abs($ax-$bx), abs($ax-$bx));
		    my $yd = abs($ay-$by);
		    $divergenceofpair{$comparison_count}{"sd"} = sqrt($xd*$xd+$yd*$yd);
		    $comparison_count++;
		}
	    }
	}
	#print "Comparing cell $i\n";
    }
    #print $n."\n";
    # At the end divide by the total comparisons between cells
    # subtract dead cells count from computation
    # What if all cells that we picked are dead? Then, set divergence to 0.
    if($dead_cells_count<$n-1){
	$divergence = $divergence/(($n-$dead_cells_count)*($n-$dead_cells_count-1)/2.0);
    } else {
	$divergence = 0.0;
    }
    return ($divergence,%divergenceofpair);
}

